In [1]:
import graphlab
In [2]:
# Cap GraphLab's PyLambda worker processes at 4.
graphlab.set_runtime_config(
    'GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS',
    4,
)
In [3]:
# Point GraphLab Canvas at the 'ipynb' target so it opens inline in the notebook.
graphlab.canvas.set_target(
    'ipynb',
)
In [4]:
# Load the house-sales data from the 'home_data.gl/' path into an SFrame.
sales = graphlab.SFrame(
    'home_data.gl/',
)
In the notebook we covered in the module, we discovered which neighborhood (zip code) of Seattle had the highest average house sale price. Now, take the sales data, select only the houses with this zip code, and compute the average price. Save this result to answer the quiz at the end.
In [5]:
# Zip code with the highest average sale price (found in the module notebook).
# Stored as a string; it is compared against sales['zipcode'] in the next cell.
highest_avg_price_zipcode = "98039"
In [7]:
# Restrict the sales to rows whose 'zipcode' equals the selected zip code.
in_selected_zipcode = sales['zipcode'] == highest_avg_price_zipcode
sales_zipcode = sales[in_selected_zipcode]
In [10]:
# Average sale price over the rows kept for the selected zip code.
selected_zipcode_prices = sales_zipcode['price']
avg_price_highest_zipcode = selected_zipcode_prices.mean()
In [11]:
# Show the average price for the selected zip code.
print(avg_price_highest_zipcode)
In [12]:
# Row count of the full sales SFrame; used later as the denominator of the
# filtered-houses fraction.
total_houses = len(sales)
In [13]:
# Show the total number of rows in the sales data.
print(total_houses)
In [17]:
# Column-wise filter: keep rows whose 'sqft_living' is strictly greater than
# 2000 and no greater than 4000.
sqft_living = sales['sqft_living']
above_lower_bound = sqft_living > 2000
within_upper_bound = sqft_living <= 4000
filtered_houses = sales[above_lower_bound & within_upper_bound]
In [18]:
# Show how many rows passed the column-wise 'sqft_living' filter.
print(filtered_houses.num_rows())
In [23]:
# Same 'sqft_living' range (2000, 4000] as the previous filter, this time
# computed with a per-row predicate passed to SFrame.apply.
in_sqft_range = sales.apply(lambda row: 2000 < row['sqft_living'] <= 4000)
filtered_houses = sales[in_sqft_range]
In [24]:
# Show how many rows passed the apply-based 'sqft_living' filter.
print(filtered_houses.num_rows())
In [27]:
# Row count of the filtered SFrame; numerator of the filtered-houses fraction.
total_filtered_houses = len(filtered_houses)
In [28]:
# Show the number of houses inside the 'sqft_living' range.
print(total_filtered_houses)
In [33]:
# Share of all houses that fall inside the 'sqft_living' range. One operand is
# converted to float so Python 2 performs true division rather than floor
# division on the two integer counts.
filtered_houses_fraction = float(total_filtered_houses) / total_houses
In [34]:
# Show the fraction of houses inside the 'sqft_living' range.
print(filtered_houses_fraction)
In [36]:
# Feature columns for the larger regression model, assembled from grouped
# sub-lists. The concatenation preserves the original order of all 18 names.
advanced_features = (
    # the six basic features
    ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']
    + [
        'condition',      # condition of house
        'grade',          # measure of quality of construction
        'waterfront',     # waterfront property
        'view',           # type of view
    ]
    + [
        'sqft_above',     # square feet above ground
        'sqft_basement',  # square feet in basement
    ]
    + [
        'yr_built',       # the year built
        'yr_renovated',   # the year renovated
    ]
    + ['lat', 'long']     # the lat-long of the parcel
    + [
        'sqft_living15',  # average sq.ft. of 15 nearest neighbors
        'sqft_lot15',     # average lot size of 15 nearest neighbors
    ]
)
In [37]:
# Show the full list of advanced feature names.
print(advanced_features)
In [38]:
# Feature columns for the basic regression model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]
In [39]:
# Randomly split the sales with fraction 0.8 and a fixed seed of 0. The first
# returned SFrame is used for training and the second for testing.
train_data, test_data = sales.random_split(0.8, seed=0)
In [40]:
# Fit a linear regression predicting 'price' from the basic feature list,
# trained on train_data with no validation set supplied.
my_feature_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=my_features,
    validation_set=None,
)
In [41]:
# Show the basic model's evaluation results on the test split.
print(my_feature_model.evaluate(test_data))
In [43]:
# Show the mean 'price' in the test split, for comparison with the model errors.
print(test_data['price'].mean())
In [44]:
# Fit a linear regression predicting 'price' from the advanced feature list,
# trained on train_data with no validation set supplied.
advanced_feature_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=advanced_features,
    validation_set=None,
)
In [45]:
# Show the advanced model's evaluation results on the test split.
print(advanced_feature_model.evaluate(test_data))
In [47]:
# Difference in test-set RMSE: basic model minus advanced model. A positive
# value means the advanced model has the lower RMSE.
basic_model_rmse = my_feature_model.evaluate(test_data)['rmse']
advanced_model_rmse = advanced_feature_model.evaluate(test_data)['rmse']
print(basic_model_rmse - advanced_model_rmse)
In [ ]: